# Check the Python version.
# BUG FIX: the original condition used `and`, which only rejected versions
# where BOTH major != 2 AND minor != 7 — e.g. Python 3.7 slipped through.
# Requiring exactly 2.7 needs `or`.
from sys import version_info
if version_info.major != 2 or version_info.minor != 7:
    raise Exception('请使用Python 2.7来完成此项目')
# Import the libraries this project depends on
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display # 使得我们可以对DataFrame使用display()函数
import xgboost as xgb
import time
from sklearn.cross_validation import train_test_split
# Display matplotlib figures inline (renders nicely inside the notebook)
%matplotlib inline
# Load the Rossmann datasets (train/test sales records plus store metadata).
try:
    # Explicit column dtypes; dtype keys for columns absent from a given
    # file (e.g. Customers in test.csv) are simply ignored by pandas.
    types = {
        'Date': np.dtype(str),
        'Sales': np.dtype(float),
        'Customers': np.dtype(int),
        'Promo': np.dtype(int),
        'SchoolHoliday': np.dtype(int),
        'StoreType': np.dtype(str),
        'Assortment': np.dtype(str),
        'CompetitionDistance': np.dtype(float),
        'CompetitionOpenSinceYear': np.dtype(int),
        'CompetitionOpenSinceMonth': np.dtype(int),
        'Promo2': np.dtype(int),
        'Promo2SinceWeek': np.dtype(int),
        'Promo2SinceYear': np.dtype(int),
        'PromoInterval': np.dtype(str)
    }
    # Shared ISO-date parser for the Date column.
    parse_iso_date = lambda dt: pd.to_datetime(dt, format='%Y-%m-%d')
    df_train = pd.read_csv("./train.csv", parse_dates=['Date'],
                           date_parser=parse_iso_date, dtype=types)
    df_test = pd.read_csv("./test.csv", parse_dates=['Date'],
                          date_parser=parse_iso_date, dtype=types)
    df_store = pd.read_csv("./store.csv")
except IOError:
    # FIX: was a bare `except:` that also hid dtype/parse errors (and even
    # KeyboardInterrupt) behind this "missing file" message; only a missing
    # or unreadable file matches the message, so catch only IOError.
    print("Dataset could not be loaded. Is the dataset missing?")
# Report shape and per-column missing-value counts for each frame.
for frame in (df_train, df_test, df_store):
    print(frame.shape)
    print(frame.isnull().sum())
    print("")
# Notebook-style peek at the distinct Promo2SinceWeek values.
df_store['Promo2SinceWeek'].astype('category').cat.categories
# Tag each row with its origin, then stack train/test so the feature
# engineering below only has to be written once.
for frame, is_train in ((df_train, 1), (df_test, 0)):
    frame['Train'] = is_train
    frame['Test'] = 1 - is_train
df_raw = pd.concat([df_train, df_test], sort=False)
# Summaries of the combined frame and the store metadata.
df_raw.info()
df_store.info()
# Open is missing only for some test rows; treat those stores as open.
df_raw.loc[df_raw['Open'].isnull(), 'Open'] = 1
# Count the distinct values of each column to spot anomalous entries
from collections import Counter
def show_me_options(data, index):
    """Summarize column `index` of `data` as '<index>:<Counter of values>'."""
    counts = Counter(data[index])
    return "{}:{}".format(index, counts)
show_me_options(df_store, 'PromoInterval')
# Open has no NaNs left, so it can become an int column.
df_raw['Open'] = df_raw['Open'].astype(int)
# Days flagged open but with zero sales are excluded from training.
open_but_no_sales = (df_raw['Open'] == 1) & (df_raw['Sales'] == 0)
df_raw.loc[open_but_no_sales, 'Train'] = 0
# Label-encode StateHoliday (string categories -> int codes).
df_raw['StateHoliday'] = df_raw['StateHoliday'].astype('category').cat.codes
show_me_options(df_raw, 'StateHoliday')
# Label-encode StoreType and Assortment (string categories -> int codes).
df_store['StoreType'] = df_store['StoreType'].astype('category').cat.codes
df_store['Assortment'] = df_store['Assortment'].astype('category').cat.codes
# Notebook-style inspection calls; return values are discarded in a script.
show_me_options(df_store,'StoreType')
show_me_options(df_store,'Assortment')
show_me_options(df_store,'PromoInterval')
df_store['PromoInterval'].astype('category').cat.categories
# First pass: encode the PromoInterval strings to codes (NaN becomes -1).
df_store['PromoInterval'] = df_store['PromoInterval'].astype('category').cat.codes
df_store['PromoInterval'].astype('category').cat.categories
# Second pass: re-categorize the integer codes and rename them to [-1,2,1,0].
# NOTE(review): this remaps the three real interval codes to 2/1/0 in
# category order — presumably a deliberate re-ranking; confirm the intent.
df_store['PromoInterval'] = df_store['PromoInterval'].astype('category').cat.rename_categories([-1,2,1,0])
# -1 marked stores without Promo2; restore those entries to NaN.
df_store.loc[(df_store['PromoInterval']==-1),'PromoInterval']=np.nan
df_store.columns
def CompetionOpenTrans2TS(df):
    """Convert a store row's competition-open year/month to a Unix timestamp.

    df : mapping with 'CompetitionOpenSinceYear'/'CompetitionOpenSinceMonth'.
    Returns np.nan when either field is missing or NaN.
    NOTE: time.mktime interprets the date in the local timezone.
    """
    try:
        date = '{}-{}'.format(int(df['CompetitionOpenSinceYear']),
                              int(df['CompetitionOpenSinceMonth']))
        return time.mktime(time.strptime(date, '%Y-%m'))
    except (ValueError, TypeError, KeyError, OverflowError):
        # FIX: was a bare `except:`; int(NaN) raises ValueError for stores
        # with no recorded competitor — only those failures should map to NaN.
        return np.nan
# Timestamp of each store's competitor opening (NaN when unknown);
# the pass-through lambda around the row function was redundant.
df_store['CompetitionOpenTS'] = df_store.apply(CompetionOpenTrans2TS, axis=1)
def Promo2Trans2TS(df_store):
    """Convert Promo2 start year/week to a Unix timestamp (Monday of that week).

    df_store : mapping with 'Promo2SinceYear'/'Promo2SinceWeek'.
    Returns np.nan when either field is missing or NaN.

    BUG FIX: strptime ignores '%W' unless a weekday is also given, so the
    original '%Y-%W' mapped every week of a year to the same date. Appending
    '-1' with '%w' (1 = Monday) makes the week number actually take effect.
    NOTE: time.mktime interprets the date in the local timezone.
    """
    try:
        date = '{}-{}-1'.format(int(df_store['Promo2SinceYear']),
                                int(df_store['Promo2SinceWeek']))
        return time.mktime(time.strptime(date, '%Y-%W-%w'))
    except (ValueError, TypeError, KeyError, OverflowError):
        # FIX: was a bare `except:`; NaN fields raise ValueError here.
        return np.nan
# Timestamp of each store's Promo2 start (NaN when not participating).
df_store['Promo2TS'] = df_store.apply(Promo2Trans2TS, axis=1)
# Per-store totals over the combined frame.
grouped = df_raw.groupby('Store')
data_sales = grouped['Sales'].sum()
data_customers = grouped['Customers'].sum()
# NOTE(review): count() tallies all non-null Open rows, open or closed, so
# the "per day" figures below are per recorded day — confirm that's intended.
data_open = grouped['Open'].count()
# Per-day averages and sales per customer per day.
data_sales_per_day = data_sales / data_open
data_customers_per_day = data_customers / data_open
data_sales_per_customer_per_day = data_sales_per_day / data_customers_per_day
# Attach the derived per-store columns to the store metadata.
for series, label in ((data_sales_per_day, 'SalesPerDay'),
                      (data_customers_per_day, 'CustomersPerDay'),
                      (data_sales_per_customer_per_day, 'SalesPerCustomersPerDay')):
    df_store = pd.merge(df_store, series.reset_index(name=label), how='left', on=['Store'])
df_store.columns
# Store-level columns to carry into the modeling frame.
features_store = [
    'Store', 'StoreType', 'Assortment', 'CompetitionDistance', 'Promo2',
    'PromoInterval', 'CompetitionOpenTS', 'Promo2TS', 'SalesPerDay',
    'CustomersPerDay', 'SalesPerCustomersPerDay',
]
# Inner-join the store metadata onto every sales row.
train = pd.merge(df_raw, df_store[features_store], on='Store')
def checkpromo(df):
    """Return 1 if the row's month falls in its Promo2 interval, else 0.

    df : mapping with 'Month' and 'PromoInterval'.
    PromoInterval was label-encoded above — presumably the month offset
    within the quarterly promo cycle, so Month % 3 == PromoInterval marks
    a promo month (TODO confirm the encoding).
    Returns np.nan when PromoInterval is NaN (store not in Promo2).
    """
    try:
        return 1 if int(df['Month'] % 3) == int(df['PromoInterval']) else 0
    except (ValueError, TypeError, KeyError):
        # FIX: was a bare `except:`; int(NaN) raises ValueError for stores
        # without Promo2 — only those failures should map to NaN.
        return np.nan
def dateTrans2TS(train):
    """Convert a row's Year/Month/Day columns to a Unix timestamp.

    (The original comment said "Promo2" — copy-paste leftover; this converts
    the calendar date.)  Returns np.nan when any part is missing or NaN.
    NOTE: time.mktime interprets the date in the local timezone.
    """
    try:
        date = '{}-{}-{}'.format(int(train['Year']), int(train['Month']),
                                 int(train['Day']))
        return time.mktime(time.strptime(date, '%Y-%m-%d'))
    except (ValueError, TypeError, KeyError, OverflowError):
        # FIX: was a bare `except:`; NaN parts raise ValueError here.
        return np.nan
# Assemble the model's feature list.
features = [
    # columns usable as-is
    'Store', 'Promo', 'SchoolHoliday', 'StateHoliday', 'StoreType',
    'Assortment', 'Promo2', 'CompetitionDistance', 'CompetitionOpenTS',
    'Promo2TS', 'SalesPerDay', 'CustomersPerDay',
    'SalesPerCustomersPerDay', 'IsPromoMonth',
    # calendar-derived columns
    'DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear', 'DateTS',
]
# Expand the Date column into its calendar parts.
date_parts = train['Date'].dt
train['Year'] = date_parts.year
train['Month'] = date_parts.month
train['Day'] = date_parts.day
train['WeekOfYear'] = date_parts.weekofyear
train['DateTS'] = train.apply(dateTrans2TS, axis=1)
# Flag rows whose month falls inside the store's promo interval.
train['IsPromoMonth'] = train.apply(checkpromo, axis=1)
print(features)
train.columns
# NOTE: sns.factorplot is the pre-0.9 seaborn API (renamed catplot later);
# kept as-is to match the installed version.
def _sales_factorplot(x_col, facet_col, hue_col):
    # Mean Sales against x_col, one facet per facet_col value.
    sns.factorplot(data=train, x=x_col, y="Sales",
                   col=facet_col, palette='plasma', hue=hue_col)

_sales_factorplot('Month', 'DayOfWeek', 'DayOfWeek')   # mean sales by day of week
_sales_factorplot('StoreType', 'Month', 'Month')       # sales by month
_sales_factorplot('Month', 'StoreType', 'StoreType')   # sales by store type
_sales_factorplot('Month', 'Promo', 'Promo')           # with vs without promo
_sales_factorplot('Month', 'Promo', 'StoreType')       # store type under promo
# Manually eyeball sales curves for a small window of store ids.
a = 126
list_stores_to_check = range(a, a + 5)
# Candidates noted so far:
# 105,126,163,172,259,274,339,349,353,364,378,404,512,517,523,560,589,663,
# 673,676,681,700,733,762,764,769,816,824,837,845,861,925,940,969,986,
# 991,1039,1068,1097,1115
plt.rcParams["figure.figsize"] = [20, len(list_stores_to_check) * 5]
for j, store_id in enumerate(list_stores_to_check, start=1):
    # Sales on days the store was open.
    open_days = train.loc[(train['Store'] == store_id) & (train['Open'] == 1)]
    plt.subplot(len(list_stores_to_check), 1, j)
    plt.plot(open_days['Date'], open_days['Sales'], '-')
    plt.minorticks_on()
    plt.grid(True, which='both')
    plt.title(store_id)
# Manually eyeball the first batch of suspicious stores
# (x-axis is the numeric DateTS timestamp).
list_stores_to_check = [105, 126, 163, 172, 259, 274, 339, 349, 353, 364,
                        378, 404, 512, 517, 523, 560, 589, 663, 673, 676]
plt.rcParams["figure.figsize"] = [20, len(list_stores_to_check) * 5]
for j, store_id in enumerate(list_stores_to_check, start=1):
    open_days = train.loc[(train['Store'] == store_id) & (train['Open'] == 1)]
    plt.subplot(len(list_stores_to_check), 1, j)
    plt.plot(open_days['DateTS'], open_days['Sales'], '-')
    plt.minorticks_on()
    plt.grid(True, which='both')
    plt.title(store_id)
# Manually eyeball the second batch of suspicious stores.
list_stores_to_check = [681, 700, 733, 762, 764, 769, 816, 824, 837, 845,
                        861, 925, 940, 969, 986, 991, 1068, 1097, 1115]
plt.rcParams["figure.figsize"] = [20, len(list_stores_to_check) * 5]
j = 1
for store_id in list_stores_to_check:
    open_days = train.loc[(train['Store'] == store_id) & (train['Open'] == 1)]
    plt.subplot(len(list_stores_to_check), 1, j)
    plt.plot(open_days['DateTS'], open_days['Sales'], '-')
    plt.minorticks_on()
    plt.grid(True, which='both')
    plt.title(store_id)
    j += 1
# Per-store cutoff timestamps found in the manual plots above: sales before
# this DateTS look anomalous (apparent closures/refurbishments), so rows
# earlier than the cutoff are excluded from training just below.
store_dates_to_remove = {105:1.368e09,
126:1.384e09,
163:1.366e09,
172:1.366e09,
259:1.362e09,
274:1.362e09,
339:1.362e09,
349:1.368e09,
353:1.364e09,
364:1.37e09,
378:1.388e09,
404:1.36e09,
512:1.36e09,
517:1.362e09,
523:1.382e09,
560:1.362e09,
589:1.368e09,
663:1.382e09,
673:1.364e09,
676:1.366e09,
681:1.37e09,
700:1.372e09,
733:1.362e09,
762:1.36e09,
764:1.366e09,
769:1.362e09,
816:1.372e09,
824:1.378e09,
837:1.394e09,
845:1.364e09,
861:1.364e09,
925:1.362e09,
940:1.362e09,
969:1.362e09,
986:1.366e09,
991:1.362e09,
1068:1.36e09,
1097:1.36e09,
1115:1.364e09}
# Exclude each listed store's pre-cutoff rows from the training mask.
# FIX: dict.iteritems() exists only in Python 2; items() works on both
# Python 2 (returns a list) and Python 3 (returns a view) identically here.
for key, value in store_dates_to_remove.items():
    train.loc[(train['Store'] == key) & (train['DateTS'] < value), 'Train'] = 0
# Re-plot the first batch: kept training rows as a line, excluded rows in red.
list_stores_to_check = [105, 126, 163, 172, 259, 274, 339, 349, 353, 364,
                        378, 404, 512, 517, 523, 560, 589, 663, 673, 676]
plt.rcParams["figure.figsize"] = [20, len(list_stores_to_check) * 5]
for j, store_id in enumerate(list_stores_to_check, start=1):
    open_days = train.loc[(train['Store'] == store_id) & (train['Open'] == 1)]
    kept = open_days.loc[open_days['Train'] == 1]
    dropped = open_days.loc[open_days['Train'] == 0]
    plt.subplot(len(list_stores_to_check), 1, j)
    plt.plot(kept['DateTS'], kept['Sales'], '-')
    plt.plot(dropped['DateTS'], dropped['Sales'], 'r')
    plt.minorticks_on()
    plt.grid(True, which='both')
    plt.title(store_id)
# Re-plot the second batch: kept training rows as a line, excluded rows in red.
list_stores_to_check = [681, 700, 733, 762, 764, 769, 816, 824, 837, 845,
                        861, 925, 940, 969, 986, 991, 1068, 1097, 1115]
plt.rcParams["figure.figsize"] = [20, len(list_stores_to_check) * 5]
for j, store_id in enumerate(list_stores_to_check, start=1):
    open_days = train.loc[(train['Store'] == store_id) & (train['Open'] == 1)]
    kept = open_days.loc[open_days['Train'] == 1]
    dropped = open_days.loc[open_days['Train'] == 0]
    plt.subplot(len(list_stores_to_check), 1, j)
    plt.plot(kept['DateTS'], kept['Sales'], '-')
    plt.plot(dropped['DateTS'], dropped['Sales'], 'r')
    plt.minorticks_on()
    plt.grid(True, which='both')
    plt.title(store_id)
def check_outlier(points, t):
    """Flag outliers by the modified z-score (median absolute deviation).

    points : 1-D or 2-D array-like of observations (rows are points)
    t      : threshold on the modified z-score (e.g. 3.5)
    Returns a boolean ndarray, True where the point is an outlier.

    FIX: coerce the input to an ndarray first so a pandas Series (as passed
    by the caller below) survives the `[:, None]` indexing, which modern
    pandas no longer supports on Series.
    NOTE(review): if more than half the points are identical the MAD is 0
    and the score divides by zero — unchanged from the original.
    """
    points = np.asarray(points)
    if points.ndim == 1:
        points = points[:, None]
    median = np.median(points, axis=0)
    # Euclidean distance of each point from the median point.
    diff = np.sqrt(np.sum((points - median) ** 2, axis=-1))
    med_abs_deviation = np.median(diff)
    # 0.6745 rescales the MAD to be consistent with the std of a normal.
    modified_z_score = 0.6745 * diff / med_abs_deviation
    return modified_z_score > t
# Spot-check one store's training rows.
train.loc[(train['Store'] == 1) & (train['Train'] == 1)]
# Flag per-store sales outliers among open-day training rows.
for store_id in train['Store'].unique():
    mask = (train['Store'] == store_id) & (train['Test'] == 0) & (train['Open'] == 1)
    train.loc[mask, 'Outlier'] = check_outlier(train.loc[mask]['Sales'], 3.5)
# How many training rows were flagged?
train.loc[(train['Test'] == 0) & (train['Outlier'] == True)].shape
# Visual check for the first 25 stores: inliers as a line, outliers as red dots.
b = 1
plt.rcParams["figure.figsize"] = [20, 10 * 5]
for panel in range(b, b + 25):
    open_days = train.loc[(train['Store'] == panel) & (train['Open'] == 1)]
    inliers = open_days.loc[open_days['Outlier'] == False]
    flagged = open_days.loc[open_days['Outlier'] == True]
    plt.subplot(10, 5, panel)
    plt.plot(inliers['Date'], inliers['Sales'], '-')
    plt.plot(flagged['Date'], flagged['Sales'], 'r.')
    plt.title(panel)
    plt.axis('on')
def rmspe(preds, y):
    """Root mean squared percentage error; assumes y contains no zeros."""
    rel_err = (y - preds) / y
    return np.sqrt(np.mean(np.square(rel_err)))
def rmspe_exp(preds, y):
    """xgboost feval: undo the log1p transform, then score with rmspe.

    y is an xgboost DMatrix; its labels hold log1p(Sales).
    """
    true_sales = np.expm1(y.get_label())
    pred_sales = np.expm1(preds)
    return "rmspe", rmspe(pred_sales, true_sales)
# xgboost training configuration.
# NOTE(review): "gpu:reg:linear" is the old (pre-0.9) GPU objective
# spelling; newer xgboost uses objective "reg:squarederror" with
# tree_method "gpu_hist" — confirm against the installed version.
params = {"objective": "gpu:reg:linear",
          "booster" : "gbtree",
          "eta": 0.01,              # learning rate
          "max_depth": 12,
          "subsample": 0.7,         # row sampling per tree
          "colsample_bytree": 0.5,  # feature sampling per tree
          # "min_child_weight": 1,
          "silent": 1,
          "seed": 42,
          'nthread':6,
          # "tree_method":"gpu_hist"
          }
# Upper bound on boosting rounds; early stopping ends training sooner.
num_boost_round = 20000
print("训练准备完成")
# Hold out 10% of the usable rows (training-flagged, open, non-outlier)
# for validation; the target is log1p(Sales).
usable = train.loc[(train['Train'] == 1) & (train['Open'] == 1) & (train['Outlier'] == False)]
X_train, X_valid, y_train, y_valid = train_test_split(usable[features],
                                                      np.log1p(usable.Sales),
                                                      test_size=0.1)
dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)
evallist = [(dtrain, 'train'), (dvalid, 'eval')]
evals_results = {}
print(features)
# Train with early stopping on the validation-set RMSPE.
gbm = xgb.train(params, dtrain, num_boost_round, evals=evallist,
                evals_result=evals_results, early_stopping_rounds=200,
                feval=rmspe_exp, verbose_eval=50)
# Audible "training done" signal.
# FIX: winsound exists only on Windows; guard the import so the beep cannot
# crash the script right after a long training run on other platforms.
try:
    import winsound
    winsound.Beep(400, 1000)
except ImportError:
    pass
plt.rcParams["figure.figsize"] = [10, 5]
# Learning curves (first 400 rounds skipped for a readable y-scale).
for split in ('train', 'eval'):
    plt.plot(evals_results[split]['rmspe'][400:])
plt.title('model accuracy')
plt.ylabel('rmspe_value')
plt.xlabel('num_boost_round')
plt.legend(['train', 'eval'], loc='upper left')
plt.savefig("model1.png")
print("生成测试集")
# Score the held-out test rows and write the submission file.
test_rows = train.loc[train['Test'] == 1]
dtest = xgb.DMatrix(test_rows[features])
test_probs = gbm.predict(dtest)
# Undo log1p; 0.965 is the original's manual shrink correction factor.
result = pd.DataFrame({"Id": test_rows['Id'].astype('int'),
                       'Sales': np.expm1(test_probs) * 0.965})
result.to_csv("xgboost_submission10.csv", index=False)
gbm.save_model('final.model')
# Feature-importance plot for the trained booster.
plt.rcParams["figure.figsize"] = [15, 15]
xgb.plot_importance(gbm, ax=None, height=0.2, xlim=None, ylim=None,
                    title='Feature importance', xlabel='F score',
                    ylabel='Features', importance_type='weight',
                    max_num_features=None, grid=True, show_values=False)
# Sanity checks that exp/expm1 invert log/log1p.
np.exp(np.log(5000))
np.expm1(np.log1p(5000))